#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <upc.h>

#define DO_C_PUT_LOOPS 1
#define RUNS 2
#define ITS 1000
#define SMALLITS 100
#define SIZE 1024
#define MAXTESTS 50

/* Variables used in tests */
struct big_obj_t {
    int x[SIZE];
};

int dummy;

shared int sh_array[SIZE/2 * THREADS];
shared strict int sh_strict_array[SIZE/2 * THREADS];
shared relaxed int sh_relaxed_array[SIZE/2 * THREADS];
shared struct big_obj_t shared_big_obj[THREADS];
struct big_obj_t local_big_obj;

/* This benchmark suite should be run with 2 threads:

       upcrun -n 2 ./bm

   Each test is run RUNS times. In each test, a 1024-int array is copied,
   loaded from, or stored to ITS times. The min, mean, and max time in
   microseconds are reported for each test.

   If the execution time is too long, the DO_C_PUT_LOOPS define above can be
   set to zero and the two C-loop put tests will be skipped.

   A very advanced compiler could elide much of the code in the tests; the
   tests were written to be simple, not bulletproof, and could be amended if
   necessary. */

/* ---------------------------------------------------------------------------
   |                                 GET TESTS                                |
   ---------------------------------------------------------------------------

   In these tests, thread 0 copies remote data from thread 1 to local
   memory. */

/* C copy of 1024 ints from remote shared memory to local private memory.
   Should have the same performance as the upc_memget() test. Should be
   slightly faster than copying to local shared memory due to less
   bookkeeping. */
void c_copy_rshared_2_lprivate(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            local_big_obj = shared_big_obj[1];
}

/* C copy of 1024 ints en masse from remote shared memory to local shared
   memory. Should have the same performance as the upc_memcpy() test. May be
   slightly slower than copies to local private memory. */
void c_copy_rshared_2_lshared(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            shared_big_obj[MYTHREAD] = shared_big_obj[1];
}

/* C loop copying 1024 ints one at a time from remote shared memory to local
   private memory. It should have the same performance as upc_memget(), *if*
   the compiler unrolls and coalesces the copies. This is a common idiom in
   naively written UPC code that clever programmers replace with awkward,
   hand-optimized UPC code. */
void c_loopcopy_rshared_2_lprivate(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            for (j = 0; j < SIZE; j++)
                local_big_obj.x[j] = shared_big_obj[1].x[j];
        }
}

/* This test is the same as the prior test, but the loop is unrolled 8 times.
   It determines whether the compiler can coalesce communication when it will
   not unroll the loop itself. This is similar to the idiom of copying
   elements of a large, contiguous structure. */
void c_loopcopy_unrolled_rshared_2_lprivate(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            for (j = 0; j < SIZE; j += 8) {
                local_big_obj.x[j+0] = shared_big_obj[1].x[j+0];
                local_big_obj.x[j+1] = shared_big_obj[1].x[j+1];
                local_big_obj.x[j+2] = shared_big_obj[1].x[j+2];
                local_big_obj.x[j+3] = shared_big_obj[1].x[j+3];
                local_big_obj.x[j+4] = shared_big_obj[1].x[j+4];
                local_big_obj.x[j+5] = shared_big_obj[1].x[j+5];
                local_big_obj.x[j+6] = shared_big_obj[1].x[j+6];
                local_big_obj.x[j+7] = shared_big_obj[1].x[j+7];
            }
        }
}
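/* The header comment above notes that a very advanced compiler could elide
   much of the copying. A minimal sketch of one countermeasure (the helper
   name is hypothetical and it is not wired into the harness): fold the
   copied bytes into the global `dummy` so each run has an observable side
   effect. A call such as consume_result(&local_big_obj) could be added at
   the end of a timed run if elision is ever observed. */
static void consume_result(const struct big_obj_t* obj)
{
    int i, acc = 0;
    for (i = 0; i < SIZE; i++)
        acc ^= obj->x[i];   /* checksum every element of the copy */
    dummy = acc;            /* store to a global the compiler must keep */
}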
/* This test uses upc_memget() to copy remote shared data to local private
   storage. It is used by advanced UPC programmers to get better performance,
   but should not be necessary. */
void upc_memget_rshared_2_lprivate(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            upc_memget(&local_big_obj, &shared_big_obj[1], sizeof(local_big_obj));
}

/* This test uses upc_memcpy() to copy remote shared data to local shared
   storage. It is used by advanced UPC programmers to get better performance,
   but should not be necessary. */
void upc_memcpy_rshared_2_lshared(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            upc_memcpy(&shared_big_obj[0], &shared_big_obj[1], sizeof(local_big_obj));
}

/* ---------------------------------------------------------------------------
   |                                 PUT TESTS                                |
   ---------------------------------------------------------------------------

   In these tests, thread 0 copies local data to remote storage at
   thread 1. */

/* C copy of 1024 ints from local private memory to remote shared memory.
   Should have the same performance as the upc_memput() test. Should be
   slightly faster than copying from local shared memory due to less
   bookkeeping. */
void c_copy_lprivate_2_rshared(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            shared_big_obj[1] = local_big_obj;
}

/* C copy of 1024 ints en masse from local shared memory to remote shared
   memory. Should have the same performance as the upc_memcpy() test. May be
   slightly slower than copies from local private memory. */
void c_copy_lshared_2_rshared(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            shared_big_obj[1] = shared_big_obj[0];
}

/* C loop copying 1024 ints one at a time from local private memory to remote
   shared memory. It should have the same performance as upc_memput(), *if*
   the compiler unrolls and coalesces the copies. This is a common idiom in
   naively written UPC code that clever programmers replace with awkward,
   hand-optimized UPC code. */
void c_loopcopy_lprivate_2_rshared(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            for (j = 0; j < SIZE; j++)
                shared_big_obj[1].x[j] = local_big_obj.x[j];
        }
}

/* This test is the same as the prior test, but the loop is unrolled 8 times.
   It determines whether the compiler can coalesce communication when it will
   not unroll the loop itself. This is similar to the idiom of copying
   elements of a large, contiguous structure. */
void c_loopcopy_unrolled_lprivate_2_rshared(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            for (j = 0; j < SIZE; j += 8) {
                shared_big_obj[1].x[j+0] = local_big_obj.x[j+0];
                shared_big_obj[1].x[j+1] = local_big_obj.x[j+1];
                shared_big_obj[1].x[j+2] = local_big_obj.x[j+2];
                shared_big_obj[1].x[j+3] = local_big_obj.x[j+3];
                shared_big_obj[1].x[j+4] = local_big_obj.x[j+4];
                shared_big_obj[1].x[j+5] = local_big_obj.x[j+5];
                shared_big_obj[1].x[j+6] = local_big_obj.x[j+6];
                shared_big_obj[1].x[j+7] = local_big_obj.x[j+7];
            }
        }
}

/* This test uses upc_memput() to copy local private data to remote shared
   storage. It is used by advanced UPC programmers to get better performance,
   but should not be necessary. */
void upc_memput_lprivate_2_rshared(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            upc_memput(&shared_big_obj[1], &local_big_obj, sizeof(local_big_obj));
}
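/* Not a timed test: a minimal sketch of overlapping a put with local work
   using the optional UPC 1.3 non-blocking library (<upc_nb.h>). Whether
   that library is available depends on the compiler, so this is an
   assumption and the block is disabled; the function name is hypothetical. */
#if 0   /* enable if the toolchain provides <upc_nb.h> */
#include <upc_nb.h>
void upc_memput_nb_sketch(void)
{
    if (MYTHREAD == 0) {
        upc_handle_t h = upc_memput_nb(&shared_big_obj[1], &local_big_obj,
                                       sizeof(local_big_obj));
        /* ... unrelated local work could overlap the transfer here ... */
        upc_sync(h);    /* block until the transfer completes */
    }
}
#endif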
/* This test uses upc_memcpy() to copy local shared data to remote shared
   storage. It is used by advanced UPC programmers to get better performance,
   but should not be necessary. */
void upc_memcpy_lshared_2_rshared(void)
{
    int i;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            upc_memcpy(&shared_big_obj[1], &shared_big_obj[0], sizeof(local_big_obj));
}

/* ---------------------------------------------------------------------------
   |                               POINTER TESTS                              |
   ---------------------------------------------------------------------------

   These tests measure the performance of shared vs. private pointers. */

/* This test measures the performance of local stores to shared data. If the
   compiler can privatize the pointer once (it is never changed), the
   performance should be the same as in the subsequent test. */
void slocal_store(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            for (j = 0; j < SIZE; j++)
                shared_big_obj[MYTHREAD].x[j] = 1;
        }
}

/* This test measures the performance of local stores through a privatized
   shared pointer. */
void plocal_store(void)
{
    int i, j;
    int* p = (int*) &shared_big_obj[MYTHREAD].x[0];
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            for (j = 0; j < SIZE; j++)
                p[j] = 1;
        }
}

/* This test measures the performance of local loads from shared data. */
void slocal_load(void)
{
    int i, j, b = 0;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            for (j = 0; j < SIZE; j++)
                b += shared_big_obj[MYTHREAD].x[j];
    dummy = b;  /* keep the loads observable */
}

/* This test measures the performance of local loads through a privatized
   pointer. */
void plocal_load(void)
{
    int i, j, b = 0;
    int* p = (int*) &shared_big_obj[MYTHREAD].x[0];
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++)
            for (j = 0; j < SIZE; j++)
                b += p[j];
    dummy = b;  /* keep the loads observable */
}

/* ---------------------------------------------------------------------------
   |                           AFFINITY OPTIMIZATIONS                         |
   ---------------------------------------------------------------------------

   These tests evaluate the performance of the upc_forall() loop. */

/* In this test, an integer (j) is used to assign an iteration to a
   thread. */
void forall_int_test(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            upc_forall (j = 0; j < SIZE; j++; j)
                sh_array[j] = 1;
        }
}

/* In this test, a shared pointer (&sh_array[j]) is used to assign an
   iteration to a thread. */
void forall_ptr_test(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            upc_forall (j = 0; j < SIZE; j++; &sh_array[j])
                sh_array[j] = 1;
        }
}

/* This code replaces the upc_forall() loop with a C loop. Advanced UPC
   programmers often write loops like this, but a good UPC compiler should
   generate equivalent code from a upc_forall() loop. */
void forall_hand_test(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < ITS; i++) {
            for (j = 0; j < SIZE; j += THREADS)
                sh_array[j] = 1;
        }
}

/* ---------------------------------------------------------------------------
   |                           RELAXED OPTIMIZATIONS                          |
   ---------------------------------------------------------------------------

   These tests evaluate whether the compiler performs any optimizations for
   overlapping accesses to relaxed shared variables. */

/* Each shared store in this test must be performed in order (strict
   semantics). */
void strict_test(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < SMALLITS; i++)
            /* the bound keeps the highest index, j + 7*THREADS, in range */
            for (j = 0; j + 7*THREADS < SIZE; j += 8) {
                sh_strict_array[j+0*THREADS] = 1;
                sh_strict_array[j+1*THREADS] = 1;
                sh_strict_array[j+2*THREADS] = 1;
                sh_strict_array[j+3*THREADS] = 1;
                sh_strict_array[j+4*THREADS] = 1;
                sh_strict_array[j+5*THREADS] = 1;
                sh_strict_array[j+6*THREADS] = 1;
                sh_strict_array[j+7*THREADS] = 1;
            }
}
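/* For reference (not a timed test): instead of qualifying individual
   objects with strict/relaxed as above, a file can select a default memory
   model by including <upc_strict.h> or <upc_relaxed.h>; under the relaxed
   default, a upc_fence restores ordering at chosen points. A minimal
   sketch, not wired into the harness: */
void fence_sketch(void)
{
    sh_relaxed_array[MYTHREAD] = 1;            /* relaxed: may be reordered */
    upc_fence;                                 /* order point (null strict access) */
    sh_relaxed_array[MYTHREAD + THREADS] = 2;  /* ordered after the fence */
}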
/* Each shared store in this test may be performed in parallel under relaxed
   UPC semantics. The loop is unrolled to give the compiler a better chance
   to perform this analysis. */
void relaxed_test(void)
{
    int i, j;
    if (MYTHREAD == 0)
        for (i = 0; i < SMALLITS; i++)
            /* the bound keeps the highest index, j + 7*THREADS, in range */
            for (j = 0; j + 7*THREADS < SIZE; j += 8) {
                sh_relaxed_array[j+0*THREADS] = 1;
                sh_relaxed_array[j+1*THREADS] = 1;
                sh_relaxed_array[j+2*THREADS] = 1;
                sh_relaxed_array[j+3*THREADS] = 1;
                sh_relaxed_array[j+4*THREADS] = 1;
                sh_relaxed_array[j+5*THREADS] = 1;
                sh_relaxed_array[j+6*THREADS] = 1;
                sh_relaxed_array[j+7*THREADS] = 1;
            }
}

/* END OF TESTS

   Boring timing functions, etc., below. */

/* Variables & functions for executing tests and measuring performance */
#define RS "remote shared "
#define LS "local shared "
#define LP "local private"

typedef void (*fptr_t)(void);

fptr_t tests[MAXTESTS];
char* test_strings[MAXTESTS];
int ntests;
int Times[MAXTESTS][RUNS];
suseconds_t t_ovhd;

void calibrate_timer(void)
{
    struct timeval t1, t2;
    gettimeofday(&t1, NULL);
    gettimeofday(&t2, NULL);
    t_ovhd = (t2.tv_usec - t1.tv_usec) + (t2.tv_sec - t1.tv_sec) * 1000000;
#if 0
    if (MYTHREAD == 0)
        fprintf(stderr, "timing overhead is %ld\n", (long) t_ovhd);
#endif
}

void print_results(void)
{
    int min, max, sum, sum2, i, j;

    if (MYTHREAD == 0) {
        for (i = 0; i < ntests; i++) {
            /* run 0 is treated as a warm-up and excluded from the stats */
            min = Times[i][1];
            max = Times[i][1];
            sum = 0;
            sum2 = 0;
            for (j = 1; j < RUNS; j++) {
                if (Times[i][j] < min) min = Times[i][j];
                if (Times[i][j] > max) max = Times[i][j];
                sum += Times[i][j];
                sum2 += Times[i][j] * Times[i][j];
            }
            printf("%s: %5d, %5d, %5d\n", test_strings[i], min, sum/(RUNS-1), max);
            /* standard deviation, currently unused:
               (int)(sqrt((float)(sum2 - (sum * sum) / (RUNS-1)) / (RUNS-1))) */
        }
    }
}

int run_test(fptr_t func)
{
    struct timeval t1, t2;
    gettimeofday(&t1, NULL);
    (*func)();
    gettimeofday(&t2, NULL);
    return (t2.tv_usec - t1.tv_usec - t_ovhd) + (t2.tv_sec - t1.tv_sec) * 1000000;
}

void run_tests(void)
{
    int i, j;
    for (i = 0; i < RUNS; i++) {
        for (j = 0; j < ntests; j++)
            Times[j][i] = run_test(tests[j]);
    }
}

void add_test(fptr_t func, char* desc)
{
    if (ntests >= MAXTESTS) {
        fprintf(stderr, "too many tests, increase MAXTESTS\n");
        exit(1);
    }
    tests[ntests] = func;
    test_strings[ntests] = malloc(strlen(desc) + 1);
    strcpy(test_strings[ntests], desc);
    ntests++;
}

void add_access_test(fptr_t func, char* desc1, char* desc2)
{
    char desc[1000];
    snprintf(desc, sizeof(desc), "%s: %s", desc1, desc2);
    add_test(func, desc);
}

void add_copy_test(fptr_t func, char* desc1, char* desc2, char* desc3)
{
    char desc[1000];
    snprintf(desc, sizeof(desc), "%s => %s", desc2, desc3);
    add_access_test(func, desc1, desc);
}
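/* A sketch of a more stable calibration than the single-sample
   calibrate_timer() above (hypothetical name, not wired into main):
   average the gettimeofday() overhead over many back-to-back pairs. */
void calibrate_timer_avg(void)
{
    struct timeval t1, t2;
    long total = 0;
    int i;
    const int samples = 1000;   /* assumed sample count */
    for (i = 0; i < samples; i++) {
        gettimeofday(&t1, NULL);
        gettimeofday(&t2, NULL);
        total += (t2.tv_usec - t1.tv_usec) + (t2.tv_sec - t1.tv_sec) * 1000000;
    }
    t_ovhd = total / samples;   /* mean overhead in microseconds */
}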
int main(int argc, char* argv[])
{
    if (THREADS < 2) {
        fprintf(stderr, "MUST BE RUN WITH AT LEAST 2 THREADS\n");
        exit(1);
    }

    printf("about to call upc_barrier\n");
    upc_barrier;
    printf("after call to upc_barrier\n");
    calibrate_timer();
    printf("after call to calibrate_timer\n");

    add_copy_test(c_copy_rshared_2_lprivate,              "GET TESTS\n"
                                                          "C copy              ", RS, LP);
    add_copy_test(c_copy_rshared_2_lshared,               "C copy              ", RS, LS);
    add_copy_test(c_loopcopy_rshared_2_lprivate,          "C loop copy         ", RS, LP);
    add_copy_test(c_loopcopy_unrolled_rshared_2_lprivate, "C unrolled loop copy", RS, LP);
    add_copy_test(upc_memget_rshared_2_lprivate,          "upc_memget          ", RS, LP);
    add_copy_test(upc_memcpy_rshared_2_lshared,           "upc_memcpy          ", RS, LS);

    add_copy_test(c_copy_lprivate_2_rshared,              "PUT TESTS\n"
                                                          "C copy              ", LP, RS);
    add_copy_test(c_copy_lshared_2_rshared,               "C copy              ", LS, RS);
#if DO_C_PUT_LOOPS
    add_copy_test(c_loopcopy_lprivate_2_rshared,          "C loop copy         ", LP, RS);
    add_copy_test(c_loopcopy_unrolled_lprivate_2_rshared, "C unrolled loop copy", LP, RS);
#endif
    add_copy_test(upc_memput_lprivate_2_rshared,          "upc_memput          ", LP, RS);
    add_copy_test(upc_memcpy_lshared_2_rshared,           "upc_memcpy          ", LS, RS);

    add_access_test(slocal_store, "LOCAL POINTER TESTS\n"
                                  "C store loop", LS);
    add_access_test(plocal_store, "C store loop", LP);
    add_access_test(slocal_load,  "C load loop ", LS);
    add_access_test(plocal_load,  "C load loop ", LP);

    add_test(forall_int_test,  "FORALL AFFINITY TESTS\n"
                               "affinity by integer");
    add_test(forall_ptr_test,  "affinity by pointer");
    add_test(forall_hand_test, "affinity by hand   ");

    add_test(strict_test,  "STRICT/RELAXED TESTS\n"
                           "strict test ");
    add_test(relaxed_test, "relaxed test");

    printf("about to call run_tests, value of ntests is %d\n", ntests);
    run_tests();
    printf("about to call print_results\n");
    print_results();
    exit(0);
}
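/* Example invocation, assuming the Berkeley UPC toolchain (the compiler
   name and flags are assumptions; adjust for your installation):

       upcc bm.c -o bm
       upcrun -n 2 ./bm

   Each result line printed by print_results() has the form

       <test description>: <min>, <mean>, <max>

   with times in microseconds, as described in the header comment. */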